#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 低方差过滤法.py
# @Author: dongguangwen
# @Date  : 2025-02-13 22:04
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# 1. Load the dataset.
# The first CSV column is an unnamed row index (pandas labels it "Unnamed: 0"
# in the recorded output at the bottom of this file). It is read as the
# DataFrame index so that it is not treated as a feature: it appears to be a
# running row number, whose variance would otherwise be far above the
# threshold and let it survive the filter as if it were a real feature.
data = pd.read_csv('./data/垃圾邮件分类数据.csv', index_col=0)
print(data.head(5))
print(data.shape)

# NOTE(review): the last column ("25732") is 1 in every previewed row and may
# be the class label rather than a feature -- confirm, and if so split it off
# before filtering.

# 2. Apply the low-variance filter: columns whose variance does not exceed
# 0.1 are removed. fit_transform returns a NumPy array, so column names are
# not carried over to the result.
transformer = VarianceThreshold(threshold=0.1)
data = transformer.fit_transform(data)
print(data.shape)

# Output recorded from a run made BEFORE the index column was excluded (the
# "Unnamed: 0" column was still counted as a feature). With index_col=0 the
# column counts are expected to drop by one -- re-run to confirm.
"""
   Unnamed: 0  0  1  2  3  4  ...  25727  25728  25729  25730  25731  25732
0           0  0  0  0  0  0  ...      0      0      0      0      0      1
1           1  0  0  0  0  0  ...      0      0      0      0      0      1
2           2  0  0  0  0  0  ...      0      0      0      0      0      1
3           3  0  0  0  0  0  ...      0      0      0      0      0      1
4           4  0  0  0  0  0  ...      0      0      0      0      0      1

[5 rows x 25734 columns]
(971, 25734)
(971, 1044)
"""